Link to paper on dataset: http://faculty.washington.edu/kenrice/heartgraphs/effectivegraphs.pdf
Dataset url: http://faculty.washington.edu/kenrice/heartgraphs/
# Install only the packages that are not already available, so that
# re-running the script does not re-download every package each time.
required_packages <- c(
  "readr", "ggplot2", "ggbeeswarm", "dplyr", "MASS", "hexbin"
)
# requireNamespace() returns FALSE (instead of erroring) when a package is
# not installed, which makes it usable as an availability check.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}
library(readr)
library(ggplot2)
library(ggbeeswarm)
library(dplyr)
library(MASS)
library(hexbin)
# Read the NHANES heart dataset; `na = "."` tells R that "." is an NA value.
heart <- read_csv(
  file = "http://faculty.washington.edu/kenrice/heartgraphs/nhaneslarge.csv",
  na = "."
)
# Preview the first rows of the data.
head(heart)
ggplot code makes graphs by layering information on top of an empty plot
# Tell ggplot to use the data frame `heart` and to put DR1TFOLA on the
# x-axis. No geom layer is added here.
ggplot(data = heart, mapping = aes(x = DR1TFOLA))
create a histogram by adding a geom layer and using the “+” sign
# Histogram of DR1TFOLA: a geom layer is added to the base plot with `+`.
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_histogram()
change x-axis label from “DR1TFOLA” to “Folate Intake”
# Same histogram, with the x-axis label set explicitly.
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_histogram() +
  labs(x = "Folate intake")
change outline color
# Histogram with the bar outline colour set to white.
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_histogram(colour = "white") +
  labs(x = "Folate intake")
change the fill of the bars
# Histogram with a white outline and a "peachpuff" fill
# (yes, that is the name of a colour).
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_histogram(colour = "white", fill = "peachpuff") +
  labs(x = "Folate intake")
The language will be different for each type of graph, so make sure to reference the documentation by typing “?command”
# Open the documentation page for geom_histogram.
help("geom_histogram")
All of our histogram plots have given us this warning:
#`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This warning tells us that this is a default that ggplot chose for us given the data it has.
change bin size
# Histogram with the number of bins set explicitly to 50.
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_histogram(
    colour = "white",
    fill = "peachpuff",
    bins = 50
  )
add facet_wrap for split plots based on a variable
# 50-bin histogram of DR1TFOLA, split into one panel per value of `gender`.
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_histogram(
    colour = "white",
    fill = "peachpuff",
    bins = 50
  ) +
  labs(x = "Folate intake") +
  facet_wrap(~ gender)
change the geom_histogram layer to geom_density
# Density plot of DR1TFOLA, one panel per value of `gender`.
# The `bins = 50` argument used for the histogram is not carried over.
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_density(colour = "white", fill = "peachpuff") +
  labs(x = "Folate intake") +
  facet_wrap(~ gender)
plot all three variables from the last plot (folate, density, and gender) on one plot by using color to differentiate gender
# Density curves on a single plot, coloured by `gender`.
# Note that `colour` has moved inside aes() so it is mapped to a variable.
ggplot(data = heart, mapping = aes(x = DR1TFOLA)) +
  geom_density(mapping = aes(colour = gender)) +
  labs(x = "Folate intake")
create a stripchart of BPXSAR (systolic blood pressure) by gender
# Strip chart: one point per observation, BPXSAR on the y-axis by gender.
ggplot(data = heart, mapping = aes(x = gender, y = BPXSAR)) +
  geom_point()
add an alpha in the geom layer to make our points more transparent
# Same strip chart with more transparent points (alpha of 0.1).
ggplot(data = heart, mapping = aes(x = gender, y = BPXSAR)) +
  geom_point(alpha = 0.1)
add jitter to space out our points
# Jittered strip chart: geom_jitter replaces geom_point to space out points.
ggplot(data = heart, mapping = aes(x = gender, y = BPXSAR)) +
  geom_jitter(alpha = 0.1)
Notes about Jitter:
jitter automatically adds space (noise) to both the height and the width of your plots
Which variables won’t be affected by noise? Categorical variables.
change jitter to width only
# Jitter horizontally only: height = 0 leaves the BPXSAR values unchanged.
ggplot(data = heart, mapping = aes(x = gender, y = BPXSAR)) +
  geom_jitter(
    alpha = 0.1,
    width = 0.5,
    height = 0
  )
change our x- and y-axis labels
# Width-only jitter with a blank x-axis label and a descriptive y-axis label.
ggplot(data = heart, mapping = aes(x = gender, y = BPXSAR)) +
  geom_jitter(
    alpha = 0.1,
    width = 0.5,
    height = 0
  ) +
  labs(x = "", y = "Systolic BP (mmHg)")
# Beeswarm plot of BPXSAR by gender with semi-transparent points.
# The labels follow the mapping (x = gender, y = BPXSAR): the blood pressure
# label belongs on the y-axis and the x-axis label is left blank.
ggplot(heart, aes(x = gender, y = BPXSAR)) +
  geom_beeswarm(alpha = .2) +
  labs(x = "", y = "Systolic BP (mmHg)")
add statistics on top of your plots with stat_summary
# Beeswarm plot of BPXSAR by gender with the group mean drawn on top as an
# orange point.
# `fun` replaces the deprecated `fun.y` argument of stat_summary().
# The labels follow the mapping (x = gender, y = BPXSAR): the blood pressure
# label belongs on the y-axis and the x-axis label is left blank.
ggplot(heart, aes(x = gender, y = BPXSAR)) +
  geom_beeswarm(alpha = .2) +
  stat_summary(fun = "mean", geom = "point", colour = "orange") +
  labs(x = "", y = "Systolic BP (mmHg)")
add other plot types as geom layers
# Boxplot with the beeswarm points layered on top of it.
# outlier.shape = NA tells the boxplot not to plot outliers.
# The labels follow the mapping (x = gender, y = BPXSAR): the blood pressure
# label belongs on the y-axis and the x-axis label is left blank.
ggplot(heart, aes(x = gender, y = BPXSAR)) +
  geom_boxplot(outlier.shape = NA) +
  geom_beeswarm(alpha = .2) +
  labs(x = "", y = "Systolic BP (mmHg)")
order is important (will change the order things are plotted)
# Same layers in the opposite order: the beeswarm points are added first and
# the boxplot (outliers hidden via outlier.shape = NA) is added after them.
# The labels follow the mapping (x = gender, y = BPXSAR): the blood pressure
# label belongs on the y-axis and the x-axis label is left blank.
ggplot(heart, aes(x = gender, y = BPXSAR)) +
  geom_beeswarm(alpha = .2) +
  geom_boxplot(outlier.shape = NA) +
  labs(x = "", y = "Systolic BP (mmHg)")
If that was easy for you, give this a shot. Try and recreate the plots below using what you learned above.
Hint: We are using “geom_violin”.
Try adding some statistics to your plot. You can start with the mean and the median.
Try adding another geom layer.
Hint: Do you want to include your outlier points?
Simple scatterplot
# Scatterplot of BPXSAR against RIDAGEYR with labelled axes.
ggplot(data = heart, mapping = aes(x = RIDAGEYR, y = BPXSAR)) +
  geom_point() +
  labs(
    x = "Age (years)",
    y = "Systolic BP (mmHg)"
  )
If you have big n, try hexbin plot
# Hexbin version of the age vs. systolic BP plot, for a large number of points.
ggplot(data = heart, mapping = aes(x = RIDAGEYR, y = BPXSAR)) +
  geom_hex() +
  labs(
    x = "Age (years)",
    y = "Systolic BP (mmHg)"
  )
Add linear regression line with SE
# Scatterplot with a linear regression line (method = "lm") drawn on top.
ggplot(data = heart, mapping = aes(x = RIDAGEYR, y = BPXSAR)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Age (years)",
    y = "Systolic BP (mmHg)"
  )
Default is loess line
# Scatterplot with geom_smooth() left at its default smoothing method.
ggplot(data = heart, mapping = aes(x = RIDAGEYR, y = BPXSAR)) +
  geom_point() +
  geom_smooth() +
  labs(
    x = "Age (years)",
    y = "Systolic BP (mmHg)"
  )
Add splines
# ns() used in the smoothing formula comes from the splines package.
library(splines)
library(MASS)
# Scatterplot with a linear-model fit on a natural spline basis of x
# (3 degrees of freedom).
ggplot(data = heart, mapping = aes(x = RIDAGEYR, y = BPXSAR)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ ns(x, df = 3)) +
  labs(
    x = "Age (years)",
    y = "Systolic BP (mmHg)"
  )
Just copy this:
library(dplyr)
# Add `age_cat`, which bins RIDAGEYR into the intervals (0,30], (30,55]
# and (55,100].
heart2 <- mutate(
  heart,
  age_cat = cut(RIDAGEYR, breaks = c(0, 30, 55, 100))
)
Recreate theirs first
# BMI vs. systolic BP, one panel per age category, with a separate linear fit
# per gender (colour is mapped only on the smoothing layer).
ggplot(data = heart2, mapping = aes(x = BMXBMI, y = BPXSAR)) +
  geom_point() +
  stat_smooth(mapping = aes(colour = gender), method = "lm") +
  facet_wrap(~ age_cat) +
  labs(
    x = "Body Mass Index" ~ (kg / m^2),
    y = "Systolic BP (mmHg)"
  )
Try with facet grid, update labels
# Same plot laid out as a grid: gender in rows, age category in columns.
ggplot(data = heart2, mapping = aes(x = BMXBMI, y = BPXSAR)) +
  geom_point() +
  stat_smooth(mapping = aes(colour = gender), method = "lm") +
  facet_grid(gender ~ age_cat) +
  labs(
    x = "Body Mass Index" ~ (kg / m^2),
    y = "Systolic BP (mmHg)"
  )
Play with colors!
# Gender-by-age grid with points and linear fits coloured by gender, using
# the minimal theme and a manual two-colour palette.
# guide = "none" hides the colour legend; it replaces the deprecated
# `guide = FALSE` spelling.
ggplot(heart2, aes(x = BMXBMI, y = BPXSAR, colour = gender)) +
  geom_point(alpha = .5) +
  stat_smooth(method = "lm") +
  facet_grid(gender ~ age_cat) +
  theme_minimal() +
  labs(x = "Body Mass Index" ~ (kg / m^2), y = "Systolic BP (mmHg)") +
  scale_color_manual(values = c("#B47CC7", "#D65F5F"), guide = "none")
# Custom palette stored in a variable so it can be reused across plots.
my_colors <- c("#C4AD66", "#77BEDB")
# Gender-by-age grid coloured with `my_colors`.
# guide = "none" hides the colour legend; it replaces the deprecated
# `guide = FALSE` spelling.
ggplot(heart2, aes(x = BMXBMI, y = BPXSAR, colour = gender)) +
  geom_point(alpha = .5) +
  stat_smooth(method = "lm") +
  facet_grid(gender ~ age_cat) +
  labs(x = "Body Mass Index" ~ (kg / m^2), y = "Systolic BP (mmHg)") +
  scale_color_manual(values = my_colors, guide = "none")